#!/usr/bin/python
# -*- coding:utf-8 -*-
import time

import sys

from html_download import Downloader
from html_parser import Parser
from url_manager import UrlManager
from html_outputer import HtmlOuter


class SpiderMain(object):
    """Crawl driver that wires together the downloader, parser, URL manager and outputer.

    The four collaborators are created once in ``__init__`` and reused for the
    whole crawl started by :meth:`run`.
    """

    def __init__(self):
        self.downloader = Downloader()
        self.parser = Parser()
        self.url_manager = UrlManager()
        self.outer = HtmlOuter()

    def _pause(self, seconds):
        """Sleep for *seconds* between requests; non-positive values skip the sleep."""
        if seconds > 0:
            time.sleep(seconds)

    def run(self, base_url, delay=10):
        """Crawl starting from *base_url* until the URL manager has no URLs left.

        For every URL handed out by the URL manager the page is downloaded and
        parsed, the links returned by the parser are queued, and a truthy
        ``question`` result is passed to the outputer.

        :param base_url: first URL to queue.
        :param delay: seconds to wait after each URL has been processed
            (default 10, the value that used to be hard-coded).
        """
        self.url_manager.add_url(base_url)

        count = 0

        while self.url_manager.has_url():
            count += 1
            url = self.url_manager.get_url()
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is also valid Python 3 syntax.
            print('%d open url:%s' % (count, url))
            html = self.downloader.do_download(url)
            if html is not None:
                links, question = self.parser.parser(html, url)
                self.url_manager.add_urls(links)
                if question:
                    self.outer.write(question)
            # Throttle after every request, including failed downloads, so the
            # crawler keeps behaving like a human visitor (a real user does not
            # open a dozen pages within one second) even while the site is
            # refusing or failing requests.
            self._pause(delay)


if __name__ == '__main__':
    # Python 2 only: force the interpreter's default string encoding to UTF-8.
    # reload(sys) is needed first because setdefaultencoding is not available
    # on the sys module as normally imported.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Start the crawl from the site root.
    SpiderMain().run('https://www.zhihu.com')
